library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(biomaRt)
library(Rtsne)
library(knitr)
library(ROCR)
library(expss)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load and prepare data


Load dataset (preprocessing code in 20_04_07_create_dataset.html)

clustering_selected = 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)

# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)

rm(getinfo, mart)

Gene filtering:


  • Remove genes without cluster (Module=gray)
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character

print(paste0('Removing ', sum(dataset$Module=='gray'), ' genes without cluster'))
## [1] "Removing 113 genes without cluster"
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor))

Variable changes:


  • Using Module Membership variables instead of binary module membership

  • Not including p-value variables

  • Including a new variable with the absolute value of GS

  • Removing information from gray module (unclassified genes)

  • Objective variable: Binary label indicating if it’s in the SFARI dataset or not

new_dataset = new_dataset %>% dplyr::select(-c(matches(paste('pval|Module')), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=ifelse(gene.score=='None', FALSE, TRUE)) %>%
              dplyr::select(-gene.score)

rownames(new_dataset) = rownames(dataset)[dataset$Module != 'gray']

rm(rm_cluster)
original_dataset = dataset
dataset = new_dataset
print(paste0('The final dataset contains ', nrow(dataset), ' observations and ', ncol(dataset), ' variables'))
## [1] "The final dataset contains 16034 observations and 34 variables"
rm(new_dataset)

Sample of the dataset (transposed so it’s easier to see)

dataset %>% head(5) %>% t %>% kable
ENSG00000000003 ENSG00000000419 ENSG00000000457 ENSG00000000460 ENSG00000000938
MTcor 0.5704012 0.2525473 -0.9514071 -0.8039747 0.8771832
GS 0.3866600 0.0319742 -0.3948272 -0.2425891 0.4206897
MM.B79F00 -0.3421536 -0.0685993 0.1822384 -0.0203122 0.0401878
MM.FE6E8A -0.0815013 -0.2406456 -0.0314884 -0.1149264 0.2532996
MM.00BF7D -0.1457984 -0.1570793 0.2895552 0.2146387 -0.1582971
MM.D376FF -0.1558365 -0.3275203 0.2216473 0.1076922 -0.1267206
MM.00B7E9 -0.2706076 0.1086927 0.2716955 0.3287583 -0.1127668
MM.00BD5F -0.2103137 -0.0739132 0.1588410 0.1164722 -0.1165457
MM.00BA38 -0.2073096 -0.2063582 0.1423063 -0.0137954 -0.2407892
MM.FF62BC 0.0881871 -0.0729658 -0.0536479 -0.1519834 0.0208935
MM.619CFF -0.0278009 -0.3888226 0.1529542 0.0699100 -0.0311339
MM.EF7F49 -0.0743495 -0.3953383 0.1857405 -0.1853277 -0.0511009
MM.F8766D 0.0320566 -0.2755247 -0.0473936 -0.2555324 -0.0717451
MM.00C0AF -0.3240023 -0.0804812 0.4041119 0.1597938 -0.4479131
MM.B983FF -0.1398423 -0.1306399 0.2955196 0.0852169 -0.3261794
MM.F564E3 -0.0472562 -0.0021517 0.1666066 -0.0218248 -0.2134750
MM.FD61D1 -0.2102549 0.3544128 0.2690348 0.4502318 -0.4636712
MM.8AAB00 0.1951790 0.3820330 -0.0471786 0.2444100 -0.1238171
MM.C99800 0.1463015 0.5008550 -0.0892625 0.3573500 -0.0912000
MM.00BFC4 -0.0419816 0.0223852 -0.0618101 -0.0495563 0.2941168
MM.E58700 0.1436711 -0.0157220 -0.3400724 -0.2724479 0.1215078
MM.FF67A4 0.2300837 -0.1215257 -0.1906585 -0.3770794 0.1655206
MM.00A7FF -0.1508215 0.1640578 0.0744979 0.1972789 -0.1825727
MM.6BB100 -0.1412895 0.0441975 0.0625034 0.2417779 0.0799198
MM.00C097 0.1450441 0.0257528 -0.2382952 -0.0953965 0.2989640
MM.39B600 0.3292611 -0.0759346 -0.3095235 -0.3958098 0.2579894
MM.9590FF 0.2281574 0.1383876 -0.2008810 0.0354509 0.3490105
MM.A3A500 0.2008548 0.0853587 -0.1824296 -0.1208052 0.2633420
MM.00BCD8 0.0804575 -0.0263322 -0.2410231 0.0801394 0.0722277
MM.E76BF3 0.4768465 -0.1831555 -0.1360703 -0.2163761 0.1917615
MM.00B0F6 0.3884997 0.1582543 -0.3053931 -0.2093428 0.3554336
MM.D89000 0.1285911 0.1002678 -0.1304555 0.1913294 0.0942191
absGS 0.3866600 0.0319742 0.3948272 0.2425891 0.4206897
SFARI 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000

Objective variable distribution: Unbalanced labels

print(table(dataset$SFARI))
## 
## FALSE  TRUE 
## 15137   897
cat(paste0('\n',round(mean(dataset$SFARI)*100,2), '% of the observations are positive'))
## 
## 5.59% of the observations are positive

Visualisations

Visualising the variables

Chose the t-SNE algorithm because it preserves distances

The SFARI labels is still close to the absolute value of Gene Significance. This time the MM variables seem to be grouped in 2 clusters

tsne = dataset %>% t %>% Rtsne(perplexity=10)

plot_data = data.frame('ID'=colnames(dataset), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2],
                       type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
                            ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
                            ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))

ggplotly(plot_data %>% ggplot(aes(C1, C2, color=type)) + geom_point(aes(id=ID)) + 
         theme_minimal() + ggtitle('t-SNE visualisation of variables'))

The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and modules with low correlation far away from the SFARI tag

mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')

plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')

ggplotly(plot_data %>% ggplot(aes(C1, C2, color=MTcor)) + geom_point(aes(id=ID)) + 
         scale_color_viridis() + theme_minimal() + 
         ggtitle('t-SNE of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, tsne)

Visualising the observations

  • The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module

  • Mean Expression doesn’t seem to play an important role

  • I don’t know what the 2nd PC is capturing

# Mean Expression data
load('./../Data/preprocessed_data.RData')
## Registered S3 methods overwritten by 'Hmisc':
##   method                 from 
##   [.labelled             expss
##   print.labelled         expss
##   as.data.frame.labelled expss
datExpr = datExpr %>% data.frame
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA
pca = dataset %>% t %>% prcomp

plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')

p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)

Resampling to reduce class imbalance


For now, will do this using over- and under-sampling of the classes, but later on should check SMOTE (Synthetic Minority Over-sampling Technique) method

Need to divide first into train and test sets to keep the sets independent: using 80% of the Positive observations on the training set

positive_idx = which(dataset$SFARI)
negative_idx = which(!dataset$SFARI)

set.seed(1234)
positive_test_idx = sort(sample(positive_idx, size=floor(0.2*length(positive_idx))))
positive_train_idx = positive_idx[!positive_idx %in% positive_test_idx]

set.seed(1234)
negative_test_idx = sort(sample(negative_idx, size=floor(0.2*length(positive_idx))))
negative_train_idx = negative_idx[!negative_idx %in% negative_test_idx]

train_set = dataset[c(positive_train_idx,negative_train_idx),]
test_set = dataset[c(positive_test_idx,negative_test_idx),]

rm(positive_idx, negative_idx, positive_train_idx, positive_test_idx, negative_train_idx, negative_test_idx)

Balancing the dataset to obtain a 1:1 ratio in labels

Over-sampling observations with positive SFARI label: Sample with replacement 4x original number of observations

Sample with replacement positive observations in train set

positive_obs = which(train_set$SFARI)

set.seed(1234)
add_obs = sample(positive_obs, size=3*length(positive_obs), replace=TRUE)

train_set = train_set[c(1:nrow(train_set), add_obs),]

rm(positive_obs, add_obs)

Under-sampling observations with negative SFARI labels

print(paste0('Keeping ~',round(100*sum(train_set$SFARI)/sum(!train_set$SFARI)),
             '% of the Negative observations in the training set'))
## [1] "Keeping ~19% of the Negative observations in the training set"
negative_obs = which(!train_set$SFARI)
set.seed(1234)
remove_obs = sample(negative_obs, size=(sum(!train_set$SFARI)-sum(train_set$SFARI)))

train_set = train_set[-remove_obs,]


rm(negative_obs, remove_obs)

Label distribution in training set

cro(train_set$SFARI)
 #Total 
 train_set$SFARI 
   FALSE  2872
   TRUE  2872
   #Total cases  5744

Labels distribution in test set

cro(test_set$SFARI)
 #Total 
 test_set$SFARI 
   FALSE  179
   TRUE  179
   #Total cases  358




Logistic Regression


Train model

train_set$SFARI = train_set$SFARI %>% as.factor

fit = glm(SFARI~., data=train_set, family='binomial')

Predict labels in test set

test_set$prob = predict(fit, newdata=test_set, type='response')
test_set$pred = test_set$prob>0.5


Performance metrics


Confusion matrix

conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  115 64   179
   TRUE  68 111   179
   #Total cases  183 175   358
rm(conf_mat)

Accuracy

acc = mean(test_set$SFARI==test_set$pred)

print(paste0('Accuracy = ', round(acc,4)))
## [1] "Accuracy = 0.6313"
rm(acc)

ROC Curve

pred_ROCR = prediction(test_set$prob, test_set$SFARI)

roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(AUC,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')

Lift Curve

lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)




Analyse model


SFARI genes have a slightly higher score distribution than the rest

plot_data = test_set %>% dplyr::select(prob, SFARI)

plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
              theme_minimal() + ggtitle('Model score distribution by SFARI Label')

  • There seems to be a positie relation between the SFARI scores and the probability assigned by the model

  • The number of observations when separating the test set by SFARI score is quite small, so this is not a robust result, specially for scores 1, 2 and 6

plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  6
   2  8
   3  38
   4  91
   5  34
   6  2
   None  179
   #Total cases  358
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))

# plot_data %>% ggplot(aes(prob, color=gene.score, fill=gene.score)) + geom_density(alpha=0.25) + 
#               geom_vline(data=mean_vals, aes(xintercept=mean_prob, color=gene.score), linetype='dashed') +
#               scale_colour_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
#               scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
#               ggtitle('Distribution of probabilities by SFARI score') +
#               xlab('Probability') + ylab('Density') + theme_minimal()

ggplotly(plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal())
rm(mean_vals)

Summary of logistic regression results

summary(fit)
## 
## Call:
## glm(formula = SFARI ~ ., family = "binomial", data = train_set)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.03717  -1.07359  -0.00685   1.08550   2.01715  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.203176   0.052763  -3.851 0.000118 ***
## MTcor        0.007607   0.069039   0.110 0.912257    
## GS          -0.964971   0.729038  -1.324 0.185629    
## MM.B79F00    1.295107   0.736868   1.758 0.078819 .  
## MM.FE6E8A   -3.625643   2.130358  -1.702 0.088775 .  
## MM.00BF7D    1.845250   1.128473   1.635 0.102013    
## MM.D376FF   -3.644446   2.096124  -1.739 0.082095 .  
## MM.00B7E9   -2.224017   2.080739  -1.069 0.285133    
## MM.00BD5F   -1.219937   1.590910  -0.767 0.443190    
## MM.00BA38    0.239257   2.090744   0.114 0.908892    
## MM.FF62BC    6.438833   2.020567   3.187 0.001439 ** 
## MM.619CFF    0.105049   0.897008   0.117 0.906772    
## MM.EF7F49    1.497764   1.690607   0.886 0.375654    
## MM.F8766D    0.043667   1.578844   0.028 0.977935    
## MM.00C0AF    4.321865   1.859996   2.324 0.020148 *  
## MM.B983FF    0.484892   1.663971   0.291 0.770740    
## MM.F564E3   -5.070359   2.413282  -2.101 0.035639 *  
## MM.FD61D1   -3.018608   2.013575  -1.499 0.133840    
## MM.8AAB00   -3.549878   1.306679  -2.717 0.006593 ** 
## MM.C99800    2.166106   2.094533   1.034 0.301056    
## MM.00BFC4    0.437179   0.517349   0.845 0.398090    
## MM.E58700   -0.261907   0.782260  -0.335 0.737769    
## MM.FF67A4    0.054520   0.960200   0.057 0.954721    
## MM.00A7FF    0.308978   1.402287   0.220 0.825607    
## MM.6BB100    4.737267   2.408627   1.967 0.049207 *  
## MM.00C097   -0.807572   2.839628  -0.284 0.776109    
## MM.39B600   -0.382276   0.955236  -0.400 0.689017    
## MM.9590FF    0.040102   1.775416   0.023 0.981979    
## MM.A3A500   -2.638096   1.027704  -2.567 0.010259 *  
## MM.00BCD8    0.458600   0.428836   1.069 0.284886    
## MM.E76BF3    0.151504   0.746839   0.203 0.839244    
## MM.00B0F6    2.192038   1.868537   1.173 0.240743    
## MM.D89000    0.075189   0.751710   0.100 0.920326    
## absGS       -0.092173   0.140691  -0.655 0.512374    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7962.9  on 5743  degrees of freedom
## Residual deviance: 7338.1  on 5710  degrees of freedom
## AIC: 7406.1
## 
## Number of Fisher Scoring iterations: 4


There doesn’t seem to be a relation between the modules’ coefficient and their correlation with Diagnosis, although there is correlation between the variables, so it’s not correct to analyse their coefficients when this happens

Also, this result changes a lot between different runs

var_fit_info = summary(fit)$coefficients %>% as.data.frame %>% 
               mutate(signif=`Pr(>|z|)`<0.05, ID=rownames(summary(fit)$coefficients))

plot_data = original_dataset %>% dplyr::select(Module, MTcor) 
colnames(plot_data) = c('ID','MTcor')

plot_data = plot_data %>% mutate(ID=gsub('#','MM.',ID)) %>% group_by(ID, MTcor) %>% 
            tally %>% inner_join(var_fit_info, by='ID')

ggplotly(plot_data %>% ggplot(aes(Estimate, MTcor, color=signif, size=n)) + geom_point(aes(id=ID)) + 
         geom_smooth(method='lm', se=FALSE) + theme_minimal() +
         xlab('Coefficient in regression') + ylab('Module-Diagnosis correlation'))
rm(var_fit_info)

There is strong correlations between variables

cors = cor(train_set[,-ncol(train_set)])
heatmap.2(cors, dendrogram='none', col=brewer.pal(11,'RdBu'), scale='none', trace='none')

rm(cors)


Genes with highest scores in test set


  • Most of the genes belong to the SFARI dataset

  • There’s not a single gene with a SFARI score of 6

test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>% 
             left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
             dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' = MTcor) %>%
             mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr,4), Probability = round(Probability,4)) %>%
             dplyr::select(GeneSymbol, gene.score, ModuleDiagnosis_corr, Module, Probability) %>%
             kable(caption = 'Genes with highest model probabilities from the test set')
Genes with highest model probabilities from the test set
GeneSymbol gene.score ModuleDiagnosis_corr Module Probability
GABBR2 4 -0.9514 #00C0AF 0.8591
ZNF804A 3 -0.6750 #D376FF 0.8454
ANK3 3 0.0586 #FE6E8A 0.8394
CSMD1 4 -0.9514 #00C0AF 0.8224
EP400 3 -0.6031 #00BA38 0.8145
NUP214 None 0.7916 #00C097 0.7995
POGZ 1 0.1127 #FF62BC 0.7978
WDFY3 2 -0.0094 #00A7FF 0.7976
DLGAP2 4 -0.9514 #00C0AF 0.7953
LRP2 4 -0.6750 #D376FF 0.7926
EPC2 4 0.7916 #00C097 0.7908
MAST4 None -0.9514 #00C0AF 0.7878
MDGA2 4 -0.9514 #00C0AF 0.7868
NOS1AP 5 -0.9514 #00C0AF 0.7825
BRD4 4 -0.6031 #00BA38 0.7733
ELAVL3 3 -0.6031 #00BA38 0.7721
SLITRK4 None -0.9514 #00C0AF 0.7715
ZNF318 None 0.7916 #00C097 0.7677
UNC80 4 -0.9514 #00C0AF 0.7653
AHDC1 3 -0.6031 #00BA38 0.7649
GABRA1 5 -0.9514 #00C0AF 0.7641
RAI1 3 0.1127 #FF62BC 0.7610
ITGAM None 0.7287 #39B600 0.7610
ADNP 1 0.1127 #FF62BC 0.7601
SLC7A5 3 0.7287 #39B600 0.7453
CNTNAP5 4 -0.9514 #00C0AF 0.7452
OTUD7A 3 -0.9514 #00C0AF 0.7380
ATP1A3 4 -0.9514 #00C0AF 0.7379
AFF2 4 0.0586 #FE6E8A 0.7374
SON None 0.1127 #FF62BC 0.7355
CHD8 1 0.7916 #00C097 0.7337
TECTA 4 0.4217 #00BCD8 0.7329
MYO16 4 -0.6750 #D376FF 0.7303
ATXN7 5 0.7916 #00C097 0.7230
NLGN2 4 -0.6031 #00BA38 0.7219
STAC2 None -0.9514 #00C0AF 0.7211
STXBP5 3 -0.9514 #00C0AF 0.7210
PTPRC 4 0.7287 #39B600 0.7175
KCND2 4 0.1127 #FF62BC 0.7158
CX3CR1 4 0.7287 #39B600 0.7143
GRIK3 4 -0.9514 #00C0AF 0.7133
ASPM 3 0.5885 #D89000 0.7133
SIPA1L1 None 0.7916 #00C097 0.7128
TRIP12 1 0.7916 #00C097 0.7113
PDP1 None -0.9514 #00C0AF 0.7094
AMBRA1 5 0.1127 #FF62BC 0.7090
MEF2C 4 -0.9514 #00C0AF 0.7030
KCND3 4 0.1127 #FF62BC 0.7013
SEZ6L2 4 -0.9514 #00C0AF 0.7006
NEGR1 4 -0.9514 #00C0AF 0.6975



Negative samples distribution

Running the model on all non-SFARI genes

negative_set = dataset %>% filter(!SFARI) %>% dplyr::select(-SFARI)
rownames(negative_set) = rownames(dataset)[!dataset$SFARI]

negative_set$prob = predict(fit, newdata=negative_set, type='response')
negative_set$pred = negative_set$prob>0.5

negative_set_table = negative_set %>% apply_labels(prob = 'Assigned Probability', 
                                                   pred = 'Label Prediction')
cro(negative_set_table$pred)
 #Total 
 Label Prediction 
   FALSE  9592
   TRUE  5545
   #Total cases  15137
cat(paste0('\n', sum(negative_set$pred), ' genes are predicted as ASD-related'))
## 
## 5545 genes are predicted as ASD-related
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
                 geom_vline(xintercept=0.5, color='#333333', linetype='dotted') + 
                 ggtitle('Probability distribution of all the Negative samples in the dataset') + 
                 theme_minimal()

Probability and Gene Significance

There’s a lot of noise, but the genes with the highest probabilities have slightly higher (absolute) Gene Significance

negative_set %>% ggplot(aes(prob, GS, color=MTcor)) + geom_point() + geom_smooth(method='loess', color='#666666') +
                 geom_hline(yintercept=0, color='gray', linetype='dashed') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between Probability and Gene Significance') + theme_minimal()

negative_set %>% ggplot(aes(prob, abs(GS), color=MTcor)) + geom_point() + 
                 geom_hline(yintercept=mean(negative_set$absGS), color='gray', linetype='dashed') + 
                 geom_smooth(method='loess', color='#666666') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between Model probability and Gene Significance') + theme_minimal()


Probability and Module-Diagnosis correlation

On average, the model seems to be assigning a probability inversely proportional to the Module-Diagnosis correlation of the module, with the highest positively correlated modules having the lowest average probability and the highest negatively correlated modules the highest average probability. But the difference isn’t big

negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + geom_point() + geom_smooth(method='loess', color='#666666') + 
                 geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
                 scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) + 
                 xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
                 theme_minimal()

Summarised version, plotting by module instead of by gene

The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

The model seems to give higher probabilities to genes belonging to modules with a small (absolute) correlation to Diagnosis, although the difference isn’t much

plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% left_join(original_dataset, by='MTcor') %>%
            dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'

ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID)) + 
         geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) + 
         xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())

Probability and level of expression

There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>% left_join(mean_and_sd, by='ID') %>% 
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, Module), by='ID')
colnames(plot_data)[ncol(plot_data)] = 'Module'

plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.2, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              theme_minimal() + ggtitle('Mean expression vs model probability by gene')

rm(mean_and_sd)
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n=n())

ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) + geom_point(color=plot_data2$Module) + 
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) + 
         geom_smooth(method='lm', se=FALSE, color='gray') + theme_minimal() + theme(legend.position='none') + 
         ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)

Probability and level of expression

There is also a positive relation between the standard deviation of a gene and its regression score, the model could be capturing this characteristic of the genes to make the prediction, and could be introducing bias

plot_data %>% filter(sdExpr<0.5) %>% ggplot(aes(sdExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.2) + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              theme_minimal() + ggtitle('SD vs model probability by gene')

This approximation curve looks like the opposite of the trend found between mean/sd and model scores

plot_data %>% ggplot(aes(meanExpr, sdExpr)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              scale_x_log10() + scale_y_log10() +
              theme_minimal() + ggtitle('Mean expression vs SD by gene')

Probability and lfc

There is a relation between probability and lfc, so it IS capturing a bit of true information (because lfc and mean expression were negatively correlated and it still has a positive relation in the model)

plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>% 
            left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')

plot_data %>% filter(abs(log2FoldChange)<10) %>%
              ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              theme_minimal() + ggtitle('lfc vs model probability by gene')



Conclusion

The model is capturing the mean level of expression of the genes (indirectly through module memberhsip), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots)


Saving results

save(train_set, test_set, negative_set, fit, dataset, file='./../Data/LR_model.RData')




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] expss_0.10.2       ROCR_1.0-7         gplots_3.0.3       Rtsne_0.15        
##  [5] biomaRt_2.40.5     RColorBrewer_1.1-2 gridExtra_2.3      viridis_0.5.1     
##  [9] viridisLite_0.3.0  plotly_4.9.2       knitr_1.28         forcats_0.5.0     
## [13] stringr_1.4.0      dplyr_0.8.5        purrr_0.3.3        readr_1.3.1       
## [17] tidyr_1.0.2        tibble_3.0.0       ggplot2_3.3.0      tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1            ellipsis_0.3.0             
##   [3] htmlTable_1.13.3            XVector_0.24.0             
##   [5] base64enc_0.1-3             GenomicRanges_1.36.1       
##   [7] fs_1.4.0                    rstudioapi_0.11            
##   [9] farver_2.0.3                bit64_0.9-7                
##  [11] AnnotationDbi_1.46.1        fansi_0.4.1                
##  [13] lubridate_1.7.4             xml2_1.2.5                 
##  [15] splines_3.6.3               geneplotter_1.62.0         
##  [17] Formula_1.2-3               jsonlite_1.6.1             
##  [19] annotate_1.62.0             broom_0.5.5                
##  [21] cluster_2.1.0               dbplyr_1.4.2               
##  [23] png_0.1-7                   shiny_1.4.0.2              
##  [25] compiler_3.6.3              httr_1.4.1                 
##  [27] backports_1.1.5             fastmap_1.0.1              
##  [29] assertthat_0.2.1            Matrix_1.2-18              
##  [31] lazyeval_0.2.2              cli_2.0.2                  
##  [33] later_1.0.0                 acepack_1.4.1              
##  [35] htmltools_0.4.0             prettyunits_1.1.1          
##  [37] tools_3.6.3                 gtable_0.3.0               
##  [39] glue_1.3.2                  GenomeInfoDbData_1.2.1     
##  [41] Rcpp_1.0.4                  Biobase_2.44.0             
##  [43] cellranger_1.1.0            vctrs_0.2.4                
##  [45] gdata_2.18.0                nlme_3.1-144               
##  [47] crosstalk_1.1.0.1           xfun_0.12                  
##  [49] rvest_0.3.5                 mime_0.9                   
##  [51] miniUI_0.1.1.1              lifecycle_0.2.0            
##  [53] gtools_3.8.2                XML_3.99-0.3               
##  [55] zlibbioc_1.30.0             scales_1.1.0               
##  [57] promises_1.1.0              hms_0.5.3                  
##  [59] parallel_3.6.3              SummarizedExperiment_1.14.1
##  [61] yaml_2.2.1                  curl_4.3                   
##  [63] memoise_1.1.0               rpart_4.1-15               
##  [65] ggExtra_0.9                 latticeExtra_0.6-29        
##  [67] stringi_1.4.6               RSQLite_2.2.0              
##  [69] genefilter_1.66.0           highr_0.8                  
##  [71] S4Vectors_0.22.1            checkmate_2.0.0            
##  [73] caTools_1.18.0              BiocGenerics_0.30.0        
##  [75] BiocParallel_1.18.1         GenomeInfoDb_1.20.0        
##  [77] rlang_0.4.5                 pkgconfig_2.0.3            
##  [79] bitops_1.0-6                matrixStats_0.56.0         
##  [81] evaluate_0.14               lattice_0.20-40            
##  [83] htmlwidgets_1.5.1           labeling_0.3               
##  [85] bit_1.1-15.2                tidyselect_1.0.0           
##  [87] magrittr_1.5                DESeq2_1.24.0              
##  [89] R6_2.4.1                    IRanges_2.18.3             
##  [91] generics_0.0.2              Hmisc_4.4-0                
##  [93] DelayedArray_0.10.0         DBI_1.1.0                  
##  [95] mgcv_1.8-31                 pillar_1.4.3               
##  [97] haven_2.2.0                 foreign_0.8-75             
##  [99] withr_2.1.2                 nnet_7.3-13                
## [101] survival_3.1-11             RCurl_1.98-1.1             
## [103] modelr_0.1.6                crayon_1.3.4               
## [105] KernSmooth_2.23-16          rmarkdown_2.1              
## [107] jpeg_0.1-8.1                progress_1.2.2             
## [109] locfit_1.5-9.4              grid_3.6.3                 
## [111] readxl_1.3.1                data.table_1.12.8          
## [113] blob_1.2.1                  reprex_0.3.0               
## [115] digest_0.6.25               xtable_1.8-4               
## [117] httpuv_1.5.2                stats4_3.6.3               
## [119] munsell_0.5.0